This notebook reports the characterisation of molecular pathways associated with the signature idenitfied from cluster 2 of cancer genomes. See /Volumes/cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer_v2_1.Rmd.

1 Download, formatting and supporting files

1.1 Color pallettes

Define colors for cancer primary sites

setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load information about analysed cancer primary sites
cancer.project.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t") %>% separate(Project.Code, c("cancer.type", "project"), sep = "-") %>% dplyr::select(cancer.type, Primary.Site)
# Add considered donor information
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
snvs.dens.ratio.df <- snvs.dens.ratio.df %>% left_join(cancer.project.info.df, by = "cancer.type")
saveRDS(snvs.dens.ratio.df, "./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
donors.color.df <- snvs.dens.ratio.df %>% dplyr::select(donor, cancer.type) %>% left_join(cancer.project.info.df, by = "cancer.type")
# Define color palette
length(unique(donors.color.df$Primary.Site)) # 14 colours needed
primary.site.color.palette <- c("#662483", "#29235C", "#1D71B8", "#006633", "#B17F4A", "#DEDC00", "#BE1622",
                                         "#2FAC66", "#F9B233", "#EA83B3", "#9D9D9C", "#E94E1B", "#683C11", "#A084BC")
primary.site.color.palette.df <- cbind.data.frame(Primary.Site = unique(donors.color.df$Primary.Site), color = primary.site.color.palette)
donors.color.df <- donors.color.df %>% left_join(primary.site.color.palette.df, by = "Primary.Site")
saveRDS(donors.color.df, "./003_Transcriptomics_cluster_analysis/rds/donors.color.df.rds")

1.2 Gene of interest

For the rest on the analysis, we consider only protein coding genes

WARNING: ICGC is using either gene name and ensembl gene id (compact or with the id version) that complicates the analysis

Gene information is recovered from ensembl

# r
library(dplyr)
library(biomaRt)
# Recover protein coding gene annotation and informations from Biomart
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
att <- listAttributes(mart)
hsapiens.coding.genes <-  getBM(attributes=c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"), filters='biotype', values=c('protein_coding'), mart=mart) # 23,222 genes
hsapiens.coding.genes <- hsapiens.coding.genes %>% dplyr::select(gene.id = ensembl_gene_id, gene.symbol = hgnc_symbol, ncbi.gene.id = entrezgene_id)
# Add both gene id and name in the same column
GOI.df <- c(hsapiens.coding.genes$gene.id, hsapiens.coding.genes$gene.symbol)
# Save results
saveRDS(hsapiens.coding.genes, "./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
write.table(as.data.frame(GOI.df), "./003_Transcriptomics_cluster_analysis/GOI.txt", col.names = F, row.names = F, quote = F)

1.3 Donors of interest

We consider all available donors with enough mutation to compute snvs density ratio

# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
write.table(as.data.frame(snvs.dens.ratio.df$donor), "./003_Transcriptomics_cluster_analysis/donor.txt", col.names = F, row.names = F, quote = F) # 1,826 donors

The expression data for all donors is downloaded (Data repositories > select all donors > Sequencing-based Gene Expression 3.62 GB of data) from the ICGC to generate /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.tsv.gz

The downloaded files are formatted and filtered for genes of interest:

WARNING #1: Genes may be referred to using either their ensembl gene id or symbol WARNING #2: ensembl gene id can reports versions or not

# bash
# Recover useful columns
zcat /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.tsv.gz | awk '{print $1,$8, $10}' | gzip -c \
  > /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz
# Remove gene.id version
zcat /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz | awk '{sub(/\..*$/, "", $2)} 1' | gzip -c \
  > /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz
# Select GOI
zcat  /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz | 
grep -wFf /cephfs2/pmurat/OriCan/003_Transcriptomics_cluster_analysis/GOI.txt | gzip -c \
  > /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz
# Delete temporary files
rm -f /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz
rm -f /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz
# Count entries
wc -l /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz # 3,374,583 entries

We then consider cancer types independently.

projects.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t")

Cancer projets with transcriptomics data:

MALY-DE: Malignant Lymphoma, Germinal center B-cell derived lymphomas PACA-(AU, CA): Pancreatic cancer, Ductal adenocarcinoma BRCA-US: Breast cancer, Ductal & lobular PRAD-(CA, US, UK, CN, FR): Prostate cancer, Adenocarcinoma LIRI-JP: Liver cancer, Hepatocellular carcinoma (Virus associated) THCA-US, HNSC-US and ORCA-IN: Head and Neck Squamous Cell Carcinoma, Head and Neck Thyroid Carcinoma, Oral Cancer OV-(US, AU): Ovary cancer, Ovarian Serous Cystadenocarcinoma KIRC-US, KIRP-US, RECA-EU: Kidney cancer, Kidney Renal Clear Cell Carcinoma, Kidney Renal Papillary Cell Carcinoma , Renal Cell Cancer

1.4 Gene expression data formatting

Sequencing-based Gene Expression data report both the normalized_read_count and raw_read_count. The definition of normalized_read_count is unclear (see https://bioinformatics.stackexchange.com/questions/3247/what-is-the-icgc-normalized-read-count for a discussion). We then consider the raw counts (column 10) and apply the appropriate corrections.

For each donor, we add information about mutation density ratio (origin/±10kb flank) values for subsequent analysis.

# r
# SLURM
library(dplyr)
library(tidyr)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
snvs.dens.ratio.light.df <- snvs.dens.ratio.df %>% dplyr::select(donor, snvs.dens.ratio)

# Load gene information
hsapiens.coding.genes <- readRDS("./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")

# Load transcriptomics data
ICGC.transc.df <- read.table(gzfile("./Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz"), sep = " ")
colnames(ICGC.transc.df) <- c("donor", "gene", "raw.count")

# Convert gene.id into gene.symbol
# Prepare lookup table
gene.lookup.df <- cbind.data.frame(gene.id = unique(ICGC.transc.df$gene)) %>% left_join((hsapiens.coding.genes %>% dplyr::select(gene.id, gene.symbol)), by = "gene.id") %>%
  mutate(gene = case_when(gene.symbol != "NA" ~ gene.symbol, T ~ gene.id)) %>% filter(gene != "") %>% dplyr::select(gene.id, gene)
# Swap column names
colnames(gene.lookup.df) <- c("gene", "gene.id")

# Join
ICGC.transc.df <- ICGC.transc.df %>% left_join(gene.lookup.df, by = "gene")

# Add snvs density ratio information and drop donors without snvs details
ICGC.transc.df <- ICGC.transc.df %>% left_join(snvs.dens.ratio.light.df, by = "donor") %>% dplyr::select(-gene) %>% drop_na() # 813 donors
saveRDS(ICGC.transc.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.df.rds")

# Add gene length information
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hg19.exons <- exonsBy(txdb, "gene")
hg19.exon.length <- width(hg19.exons)
hg19.gene.length.df <- sum(hg19.exon.length) %>% tibble::enframe(name = "gene", value = "exon.length")
# map the Entrez ID to gene symbol
gene.conversion.df <- AnnotationDbi::select(org.Hs.eg.db, keys=hg19.gene.length.df$gene, columns="SYMBOL", keytype="ENTREZID")
all.equal(hg19.gene.length.df$gene, gene.conversion.df$ENTREZID) # entries are unique
hg19.gene.length.df$gene.id <- gene.conversion.df$SYMBOL
hg19.gene.length.df <- hg19.gene.length.df %>% dplyr::select(-gene)
# Join
ICGC.transc.TPM.df <- ICGC.transc.df %>% left_join(hg19.gene.length.df, by = "gene.id")

# Compute gene expression in TPM
ICGC.transc.TPM.df <- ICGC.transc.TPM.df %>% group_by(donor, gene.id) %>%
  summarise(raw.count = mean(raw.count, na.rm = T), snvs.dens.ratio = mean(snvs.dens.ratio, na.rm = T), exon.length = mean(exon.length, na.rm = T)) %>% mutate(reads.per.rpk = raw.count/exon.length, TPM = (reads.per.rpk*1000000)/sum(reads.per.rpk, na.rm = T)) %>% dplyr::select(-exon.length, -reads.per.rpk) %>% drop_na()
saveRDS(ICGC.transc.TPM.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

# Reload table
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

2 PACA tumours analysis

We focus on a first time on pancreatic cancers (PACA-(AU, CA): Pancreatic cancer, Ductal adenocarcinoma) as they displays a wide distribution of mutation density ratio (origin/±10kb flank) values.

To identify mutagenic pathway associated with mutagenesis at origins, we cluster PACA tumors based on their transcriptional profile and characterise dysreguated pathways.

2.0.1 UMAP clustering

A Uniform Manifold Approximation and Projection (UMAP) approach is used to assess tumor heterogeneity in term of transcription.

# r
# SLURM and local
library(dplyr)
library(tidyr)
library(umap)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load gene expression data for all donors and cancer types
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

# Spread df
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.df %>% dplyr::select(donor, gene.id, TPM) %>% distinct() 
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.spread.df %>% pivot_wider(names_from = gene.id, values_from = TPM, values_fn = first)
ICGC.transc.TPM.spread.df <- as.data.frame(ICGC.transc.TPM.spread.df)
row.names(ICGC.transc.TPM.spread.df) <- ICGC.transc.TPM.spread.df$donor
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.spread.df[,-1]
ICGC.transc.TPM.spread.df[is.na(ICGC.transc.TPM.spread.df)] <- 0
saveRDS(ICGC.transc.TPM.spread.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.spread.df.rds")
dim(ICGC.transc.TPM.spread.df) # 813 donors, 19298 genes

# Reload data and perform UMAP
# The random_state argument of the umap function is to define the original random state of the analysis (equivalent to set.seed)

# from the cluster
ICGC.transc.TPM.spread.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.spread.df.rds")
ICGC.transc.TPM.umap.df <- umap(ICGC.transc.TPM.spread.df, n_components = 2, random_state = 50) 
saveRDS(ICGC.transc.TPM.umap.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.df.rds")

# Reopen locally
ICGC.transc.TPM.umap.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.df.rds")

# Format UMAP
umap.layout <- ICGC.transc.TPM.umap.df[["layout"]] 
umap.layout <- data.frame(umap.layout) 
ICGC.transc.TPM.umap.final.df <- cbind(umap.layout, donor = rownames(ICGC.transc.TPM.spread.df)) 

# Add primary site information
ICGC.transc.TPM.umap.final.df <- ICGC.transc.TPM.umap.final.df %>% left_join(donors.color.df, by = "donor") %>% distinct()
saveRDS(ICGC.transc.TPM.umap.final.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")

# We observe two independent clusters for blood and pancreatic cancer

# Blood clusters are made from MALY and CCLE - not considered for now

# Rerun UMAP analysis on pancreatic cancer

donor.info.df <- read.csv("./Dataset/ICGC/donor.all_projects.tsv", sep = "\t")

# We focus on two projects: PACA-AU and PACA-CA: Pancreatic cancer, Ductal adenocarcinoma.

# Pancreas (PACA)

PACA.cluster.id <- (donor.info.df %>% filter(project_code == "PACA-AU" | project_code == "PACA-CA"))$icgc_donor_id # 778 donors
ICGC.transc.TPM.spread.PACA.df <- ICGC.transc.TPM.spread.df[which(rownames(ICGC.transc.TPM.spread.df) %in% PACA.cluster.id),] # 194 donors

ICGC.transc.TPM.PACA.umap.df <- umap(ICGC.transc.TPM.spread.PACA.df, n_components = 2, random_state = 20)
umap.PACA.layout <- ICGC.transc.TPM.PACA.umap.df[["layout"]] 
umap.PACA.layout <- data.frame(umap.PACA.layout) 
ICGC.transc.TPM.umap.PACA.final.df <- cbind(umap.PACA.layout, donor = rownames(ICGC.transc.TPM.spread.PACA.df)) 
saveRDS(ICGC.transc.TPM.umap.PACA.final.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")

# Select PACA clusters

PACA.cluster.1.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X1 >= 1 & X2 >= 0) %>% mutate(cluster = "clust.1") # 34 donors
PACA.cluster.2.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X1 <= 1 & X2 >= 0) %>% mutate(cluster = "clust.2") # 33 donors
PACA.cluster.3.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X2 <= 0) %>% mutate(cluster = "clust.3") # 127 donors
PACA.clusters.summary.df <- rbind(PACA.cluster.1.df, PACA.cluster.2.df, PACA.cluster.3.df) %>% dplyr::select(donor, cluster) %>%
  left_join(snvs.dens.ratio.df, by = "donor") %>% mutate(cancer.project = paste(cancer.type, cancer.project, sep = "-"))
saveRDS(PACA.clusters.summary.df, file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")

Plot UMAP and PACA cluster characterisation

library(dplyr)
library(ggplot2)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
ICGC.transc.TPM.umap.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")
ICGC.transc.TPM.umap.PACA.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")

# For all cancer types

ICGC.transc.TPM.umap.final.plot <- ICGC.transc.TPM.umap.final.df %>% ggplot(aes(x = X1, y = X2)) +
  geom_point(aes(color = color), shape = 16, alpha = 0.6) + 
  scale_color_identity(name = "Sites", breaks = ICGC.transc.TPM.umap.final.df$color, labels = ICGC.transc.TPM.umap.final.df$Primary.Site, guide = "legend") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("UMAP") +
  theme_bw() + theme(aspect.ratio=1)
ICGC.transc.TPM.umap.final.plot

# For PACA project (colored by clusters)

ICGC.transc.TPM.umap.PACA.final.plot <- ICGC.transc.TPM.umap.PACA.final.df %>%
  mutate(cluster = case_when(X1 >= 1 & X2 >= 0 ~ "cluster 1",
                             X1 <= 1 & X2 >= 0 ~ "cluster 2",
                             X2 <= 0 ~ "cluster 3")) %>% 
  ggplot(aes(x = X1, y = X2, color = cluster)) +
  geom_point(shape = 16) + 
  scale_color_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("UMAP - PACA cancers") +
  theme_bw() + theme(aspect.ratio=1)
ICGC.transc.TPM.umap.PACA.final.plot

pdf("./Rplots/ICGC.transc.TPM.umap.final.plot.pdf", width=7, height=6, useDingbats=FALSE)
ICGC.transc.TPM.umap.final.plot
dev.off()
## quartz_off_screen 
##                 2
pdf("./Rplots/ICGC.transc.TPM.umap.PACA.final.plot.pdf", width=7, height=6, useDingbats=FALSE)
ICGC.transc.TPM.umap.PACA.final.plot
dev.off()
## quartz_off_screen 
##                 2

2.1 PACA clusters characterisation

Characterise mutational burden of PACA clusters.

library(dplyr)
library(ggplot2)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load data
ICGC.transc.TPM.umap.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")
ICGC.transc.TPM.umap.PACA.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")

# snvs density ratio

PACA.clusters.snvs.plot <- PACA.clusters.summary.df %>% ggplot(aes(x=cluster, y=snvs.dens.ratio)) +
  geom_boxplot(aes(fill = cluster), alpha = 0.5) + geom_jitter(alpha = 0.2) +
  scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) + ylab("Mutation density ratio (ori/flanks)") +
  ggtitle("Pval = 1.643e-06 & 9.303e-09") + ylim(0,6.5) +
  theme_bw() + theme(aspect.ratio = 2, axis.title.x=element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
PACA.clusters.snvs.plot

pdf("./Rplots/PACA.clusters.snvs.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.clusters.snvs.plot
dev.off()
## quartz_off_screen 
##                 2
# Total number of snvs

PACA.clusters.snvs.total.plot <- PACA.clusters.summary.df %>% ggplot(aes(x=cluster, y=snvs.count)) +
  geom_boxplot(aes(fill = cluster), alpha = 0.5) + geom_jitter(alpha = 0.2) +
  scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) + ylab("Genome-wide snvs count") +
  scale_y_continuous(trans = log10_trans(),
                     breaks = trans_breaks("log10", function(x) 10^x),
                     labels = trans_format("log10", math_format(10^.x)), limits = c(2000,200000)) +
  ggtitle("Pval = 0.3795 & 0.1115") +
  theme_bw() + theme(aspect.ratio = 2, axis.title.x=element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
PACA.clusters.snvs.total.plot

pdf("./Rplots/PACA.clusters.snvs.total.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.clusters.snvs.total.plot
dev.off()
## quartz_off_screen 
##                 2
# Compute stats

ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"),]$snvs.dens.ratio,
        PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.dens.ratio) # p-value = 1.643e-06
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"), ]$snvs.dens.ratio and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.dens.ratio
## D = 0.37344, p-value = 1.643e-06
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.dens.ratio,
        PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"),]$snvs.dens.ratio) # p-value = 9.303e-09
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.dens.ratio and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"), ]$snvs.dens.ratio
## D = 0.34884, p-value = 9.303e-09
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"),]$snvs.count,
        PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.count) # p-value = 0.3795
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"), ]$snvs.count and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.count
## D = 0.12834, p-value = 0.3795
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.count,
        PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"),]$snvs.count) # p-value = 0.1115
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.count and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"), ]$snvs.count
## D = 0.13531, p-value = 0.1115
## alternative hypothesis: two-sided

2.2 PACA clusters mutational signatures

In this section, we compute the mutational signatures at origins (as previously done) for clustered PACA tumours.

# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load cluster information
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")

# Recover donor id
PACA.cluster.1.id <- PACA.cluster.1.df$donor # 34 genomes
PACA.cluster.2.id <- PACA.cluster.2.df$donor # 33 genomes
PACA.cluster.3.id <- PACA.cluster.3.df$donor # 127 genomes

# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")

# Subset donors
PACA.cluster.1.id <- PACA.cluster.1.id[PACA.cluster.1.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 5
PACA.cluster.2.id <- PACA.cluster.2.id[PACA.cluster.2.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 2
PACA.cluster.3.id <- PACA.cluster.3.id[PACA.cluster.3.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 11

# Recover cluster specific profiles
PACA.cluster.1.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.1.id)) %>% mutate(PACA.1 = rowSums(.)) %>% dplyr::select(PACA.1)
PACA.cluster.2.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.2.id)) %>% mutate(PACA.2 = rowSums(.)) %>% dplyr::select(PACA.2)
PACA.cluster.3.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.3.id)) %>% mutate(PACA.3 = rowSums(.)) %>% dplyr::select(PACA.3)

# Combine information
PACA.cluster.mat <- cbind.data.frame(PACA.cluster.1.mat, PACA.cluster.2.mat, PACA.cluster.3.mat)
PACA.cluster.mat <- as.matrix(PACA.cluster.mat)
saveRDS(PACA.cluster.mat, "./003_Transcriptomics_cluster_analysis/rds/PACA.cluster.mat.rds")

# Plot profiles
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)

Plot

# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

PACA.cluster.mat <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.cluster.mat.rds")

# Plot profiles
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)

pdf("./Rplots/PACA.cluster.mut.sig.profile.pdf", width=5, height=4, useDingbats=FALSE)
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)
dev.off()
## quartz_off_screen 
##                 2
# Compute cosine similarity with cluster 2 SBS

# Load cluster mutation matrix
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# Compute cosine matrix
snvs.ori.PACA.sim.mat <- cos_sim_matrix(PACA.cluster.mat, snvs.ori.mut.cluster)
snvs.ori.PACA.sim.mat
##        cluster.1 cluster.2 cluster.3 cluster.4 cluster.5
## PACA.1 0.1286583 0.9731505 0.2427472 0.1377507 0.3576658
## PACA.2 0.1042018 0.9059249 0.3786150 0.1214611 0.3819263
## PACA.3 0.3094851 0.6913128 0.4125451 0.3720286 0.8004612

PACA signatures have high cosine similarity with the previously identifed cluster 2 SBS.

2.3 PACA clusters signature local exposure

In this section, we compute the local exposure of cluster 2 SBS at origins (as previously done) for clustered PACA tumours.

2.3.1 Compute mutation count matrices

Mutations from genomes of each cluster are combined and spatially resolved in bins of 100 nt around origins. The extracted signatures are then fitted to the mutation count matrices in order to characterise their spatial contribution.

Combine vcf files

# PACA 1
# Recover donor id, file names and paths
PACA.1.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.1.id))$file.name # 5 genomes
PACA.1.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.1.vcf.name, sep = "")
# Combine vcf files
PACA.1.snvs.df <- tibble()
for (i in 1:length(PACA.1.vcf.path)) {
  print(i/length(PACA.1.vcf.path))
  vcf.i <- read.table(gzfile(PACA.1.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  PACA.1.snvs.df <- rbind(PACA.1.snvs.df, vcf.i)
}
write.table(PACA.1.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# PACA 2
# Recover donor id, file names and paths
PACA.2.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.2.id))$file.name # 2 genomes
PACA.2.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.2.vcf.name, sep = "")
# Combine vcf files
PACA.2.snvs.df <- tibble()
for (i in 1:length(PACA.2.vcf.path)) {
  print(i/length(PACA.2.vcf.path))
  vcf.i <- read.table(gzfile(PACA.2.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  PACA.2.snvs.df <- rbind(PACA.2.snvs.df, vcf.i)
}
write.table(PACA.2.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

# PACA 3
# Recover donor id, file names and paths
PACA.3.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.3.id))$file.name # 11 genomes
PACA.3.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.3.vcf.name, sep = "")
# Combine vcf files
PACA.3.snvs.df <- tibble()
for (i in 1:length(PACA.3.vcf.path)) {
  print(i/length(PACA.3.vcf.path))
  vcf.i <- read.table(gzfile(PACA.3.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
  PACA.3.snvs.df <- rbind(PACA.3.snvs.df, vcf.i)
}
write.table(PACA.3.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)

Compute snvs-origin distances and filter mutations at less than 10kb of an origin

# bash/SLURM

# PACA 1
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed

# PACA 2
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed

# PACA 3
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed

Split bed files according to origin distances

# r

# PACA 1
PACA.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.10kb.bed", sep = "\t")
PACA.1.snvs.10kb.df <- PACA.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.1.snvs.10kb.df$dist)) {
  PACA.1.vcf <- PACA.1.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(PACA.1.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/Bin_",i,".bed")
  write.table(PACA.1.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# PACA 2
PACA.2.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.10kb.bed", sep = "\t")
PACA.2.snvs.10kb.df <- PACA.2.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.2.snvs.10kb.df$dist)) {
  PACA.2.vcf <- PACA.2.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(PACA.2.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/Bin_",i,".bed")
  write.table(PACA.2.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

# PACA 3
PACA.3.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.10kb.bed", sep = "\t")
PACA.3.snvs.10kb.df <- PACA.3.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.3.snvs.10kb.df$dist)) {
  PACA.3.vcf <- PACA.3.snvs.10kb.df %>% filter(dist == i) %>%
    mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
    dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
  colnames(PACA.3.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
  vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/Bin_",i,".bed")
  write.table(PACA.3.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}

Convert to vcf format

# bash/SLURM

# PACA 1
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1

# PACA 2
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/*.bed
do
path=${filename%.*}
file=${path##*/}
  echo $file
  awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/$file.vcf
  done
  rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2

# PACA 3
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/$file.bed >  /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3

Extract mutational count matrices

# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19

# PACA 1
vcfFiles.PACA.1 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/"), value = TRUE)
PACA.1.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/", vcfFiles.PACA.1)
PACA.1.sample.names <- vcfFiles.PACA.1 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.1.snvs.grl <- read_vcfs_as_granges(PACA.1.files, PACA.1.sample.names, ref.genome)
saveRDS(PACA.1.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.1.snvs.grl.rds")

# PACA 2
vcfFiles.PACA.2 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/"), value = TRUE)
PACA.2.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/", vcfFiles.PACA.2)
PACA.2.sample.names <- vcfFiles.PACA.2 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.2.snvs.grl <- read_vcfs_as_granges(PACA.2.files, PACA.2.sample.names, ref.genome)
saveRDS(PACA.2.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.2.snvs.grl.rds")

# PACA 3
vcfFiles.PACA.3 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/"), value = TRUE)
PACA.3.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/", vcfFiles.PACA.3)
PACA.3.sample.names <- vcfFiles.PACA.3 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.3.snvs.grl <- read_vcfs_as_granges(PACA.3.files, PACA.3.sample.names, ref.genome)
saveRDS(PACA.3.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.3.snvs.grl.rds")

# Load grange lists
PACA.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.1.snvs.grl.rds")
PACA.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.2.snvs.grl.rds")
PACA.3.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.3.snvs.grl.rds")

# Compute mutation count matrices
PACA.1.mut.mat <- mut_matrix(vcf_list = PACA.1.snvs.grl, ref_genome = ref.genome)
PACA.2.mut.mat <- mut_matrix(vcf_list = PACA.2.snvs.grl, ref_genome = ref.genome)
PACA.3.mut.mat <- mut_matrix(vcf_list = PACA.3.snvs.grl, ref_genome = ref.genome)

# Save
saveRDS(PACA.1.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.1.mut.mat.rds")
saveRDS(PACA.2.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.2.mut.mat.rds")
saveRDS(PACA.3.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.3.mut.mat.rds")

2.3.2 Plot local exposure

# r

# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")

# Load identified signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")

# PACA 1
PACA.1.fit <- fit_to_signatures(PACA.1.mut.mat, snvs.ori.mut.cluster)
PACA.1.fit.df <- as.data.frame(PACA.1.fit$contribution)
PACA.1.fit.df <- as.data.frame(t(PACA.1.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.1 = cluster.2)
PACA.1.fit.df$dist <- as.numeric(PACA.1.fit.df$dist)
PACA.1.fit.df <- PACA.1.fit.df %>% dplyr::select(dist, value = PACA.1) %>% mutate(cluster = "PACA.1")

# PACA 2
PACA.2.fit <- fit_to_signatures(PACA.2.mut.mat, snvs.ori.mut.cluster)
PACA.2.fit.df <- as.data.frame(PACA.2.fit$contribution)
PACA.2.fit.df <- as.data.frame(t(PACA.2.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.2 = cluster.2)
PACA.2.fit.df$dist <- as.numeric(PACA.2.fit.df$dist)
PACA.2.fit.df <- PACA.2.fit.df %>% dplyr::select(dist, value = PACA.2) %>% mutate(cluster = "PACA.2")

# PACA 3
PACA.3.fit <- fit_to_signatures(PACA.3.mut.mat, snvs.ori.mut.cluster)
PACA.3.fit.df <- as.data.frame(PACA.3.fit$contribution)
PACA.3.fit.df <- as.data.frame(t(PACA.3.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.3 = cluster.2)
PACA.3.fit.df$dist <- as.numeric(PACA.3.fit.df$dist)
PACA.3.fit.df <- PACA.3.fit.df %>% dplyr::select(dist, value = PACA.3) %>% mutate(cluster = "PACA.3")

# Combine results and plot

PACA.summary.norm.fit.df <- rbind(PACA.1.norm.fit.df, PACA.2.norm.fit.df, PACA.3.norm.fit.df)
saveRDS(PACA.summary.norm.fit.df, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.summary.norm.fit.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Load results
PACA.summary.norm.fit.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/PACA.summary.norm.fit.df.rds")

# Plot
PACA.summary.norm.fit.plot <- PACA.summary.norm.fit.df %>%
  ggplot(aes(x = dist, y = value, color = cluster)) +
  geom_line() + xlim(-5000,5000) +
  scale_color_manual(values = c("#E4221E", "#F39208", "#504B76")) +
  geom_hline(yintercept=0, linetype="dashed") +
  xlab("Distance from origins (bp)") + ylab("Cluster 2 signature\nrelative contribution") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
PACA.summary.norm.fit.plot

pdf("./Rplots/PACA.summary.norm.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
PACA.summary.norm.fit.plot
dev.off() 
## quartz_off_screen 
##                 2

2.4 Cluster 2 SBS strand asymetry

Assess the potential strand asymmetry of cluster 2 SBS

# Load cluster 2 snvs gr list
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")

# Split pyrimidine and purine mutations
cluster.2.pyr.grl <- cluster.2.snvs.grl
cluster.2.pur.grl <- cluster.2.snvs.grl
for (i in 1:length(cluster.2.snvs.grl)) {
  print(i/length(cluster.2.snvs.grl))
  gr.i <- cluster.2.snvs.grl[i]
  bin.i <- names(gr.i)
  unlist.gr.i <- unlist(gr.i)
  pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
  pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
  cluster.2.pyr.grl[i] <- pyr.gr.comp.i
  pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
  pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
  cluster.2.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.2.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pyr.grl.rds")
saveRDS(cluster.2.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pur.grl.rds")

# Compute mutational matrices
cluster.2.pyr.mat <- mut_matrix(vcf_list = cluster.2.pyr.grl, ref_genome = ref.genome)
cluster.2.pur.mat <- mut_matrix(vcf_list = cluster.2.pur.grl, ref_genome = ref.genome)

# Fit signatures
cluster.2.pyr.fit <- fit_to_signatures(cluster.2.pyr.mat, snvs.ori.mut.cluster)
cluster.2.pur.fit <- fit_to_signatures(cluster.2.pur.mat, snvs.ori.mut.cluster)

# Format
cluster.2.pyr.fit.df <- as.data.frame(cluster.2.pyr.fit$contribution)
cluster.2.pyr.fit.df <- as.data.frame(t(cluster.2.pyr.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, value = cluster.2) %>% mutate(strand = "plus")
cluster.2.pur.fit.df <- as.data.frame(cluster.2.pur.fit$contribution)
cluster.2.pur.fit.df <- as.data.frame(t(cluster.2.pur.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, value = cluster.2) %>% mutate(strand = "minus")
cluster.2.strand.fit.df <- rbind(cluster.2.pyr.fit.df, cluster.2.pur.fit.df) 
cluster.2.strand.fit.df$dist <- as.numeric(cluster.2.strand.fit.df$dist)
saveRDS(cluster.2.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

cluster.2.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
cluster.2.strand.fit.plot <- cluster.2.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
  geom_line() + xlim(-5000, 5000) +
  scale_color_manual(values = c("#514B76", "#D56114")) +
  xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 2 contribution") +
  theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.strand.fit.plot

pdf("./Rplots/cluster.2.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.2.strand.fit.plot
dev.off()
## quartz_off_screen 
##                 2

3 PACA differential expression analysis

3.1 DEseq

In order to identify the molecular pathway associated with cluster 2 SBS signature we performed a differential gene expression analysis based on the UMAP clustering.

Differential gene expression analysis is performed with DEseq from raw counts.

# r
library(dplyr)
library(DESeq2)
library(RColorBrewer)
library(gplots)
library(scales)

# Load clustering information
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")

# Group donor by cluster
PACA.cluster.1.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.1"))$donor) # 34 donors
PACA.cluster.2.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.2"))$donor) # 33 donors
PACA.cluster.3.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.3"))$donor) # 127 donors

# Prepare raw count matrix
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
ICGC.transc.PACA.cluster.1.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.1.donor) %>% mutate(cluster = "PACA.1")
ICGC.transc.PACA.cluster.2.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.2.donor) %>% mutate(cluster = "PACA.2")
ICGC.transc.PACA.cluster.3.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.3.donor) %>% mutate(cluster = "PACA.3")
ICGC.transc.PACA.TPM.df <- rbind(ICGC.transc.PACA.cluster.1.TPM.df, ICGC.transc.PACA.cluster.2.TPM.df, ICGC.transc.PACA.cluster.3.TPM.df) # 194 donors
#
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.TPM.df %>% dplyr::select(donor, gene.id, raw.count) %>% distinct()
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.spread.df %>% pivot_wider(names_from = donor, values_from = raw.count) %>% as.data.frame()
row.names(ICGC.transc.PACA.spread.df) <- ICGC.transc.PACA.spread.df$gene.id
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.spread.df[,-1]
cts <- ICGC.transc.PACA.spread.df %>% drop_na()
cts <- cts %>% mutate(across(everything(), round, 1)) %>% mutate(across(everything(), ~as.integer(.)))

# Prepare condition df
coldata <- cbind.data.frame(donor = colnames(ICGC.transc.PACA.spread.df)) %>%
  mutate(cluster = case_when(donor %in% PACA.cluster.1.donor ~ "PACA.1",
                             donor %in% PACA.cluster.2.donor ~ "PACA.2",
                             donor %in% PACA.cluster.3.donor ~ "PACA.3"))
coldata$cluster <- as.factor(coldata$cluster)
rownames(coldata) <- coldata$donor

# Check wehter the columns of the count matrix and the rows of the column data (information about samples) are in the same order. 
all(rownames(coldata) %in% colnames(cts)) # TRUE

# Create DEseq experiement
dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ cluster)
dds

# Pre-filtering - Remove genes with less than 30 observations (as PACA 1 and PACA 2 have around 30 tumours)
smallestGroupSize <- 30
keep <- rowSums(counts(dds) >= 30) >= smallestGroupSize
dds <- dds[keep,] # 14,702 genes, 194 samples

# Differential expression analysis
dds <- DESeq(dds)
resultsNames(dds)
# "Intercept" "cluster_PACA.2_vs_PACA.1" "cluster_PACA.3_vs_PACA.1"

# Assess DEG associated with PACA clusters

# PACA1 vs PACA3
res.PACA1.vs.PACA3 <- results(dds, pAdjustMethod = "fdr", contrast = c("cluster", "PACA.1", "PACA.3"))
res.PACA1.vs.PACA3 <- as.data.frame(res.PACA1.vs.PACA3) %>% mutate(log10.pval = -log10(padj))
res.PACA1.vs.PACA3$gene.id <- row.names(res.PACA1.vs.PACA3)
saveRDS(res.PACA1.vs.PACA3, "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")

# PACA1 vs PACA2
res.PACA1.vs.PACA2 <- results(dds, pAdjustMethod = "fdr", contrast = c("cluster", "PACA.1", "PACA.2"))
res.PACA1.vs.PACA2 <- as.data.frame(res.PACA1.vs.PACA2) %>% mutate(log10.pval = -log10(padj))
res.PACA1.vs.PACA2$gene.id <- row.names(res.PACA1.vs.PACA2)
saveRDS(res.PACA1.vs.PACA2, "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.2.clust.DEseq.result.rds")

Visualise result

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

DEseq.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")
# Add rank based on adjusted pvalues
DEseq.df <- DEseq.df %>% mutate(rank = dense_rank(-dplyr::desc(padj))) %>% 
  mutate(class = case_when(rank <= 2000 & log2FoldChange <= 0 ~ "down",
                           rank <= 2000 & log2FoldChange >= 0 ~ "up",
                           T ~ "non")) %>% arrange(-padj)
# Plot
DEseq.plot <- DEseq.df %>%
  ggplot(aes(x=baseMean, y=log2FoldChange, colour=class)) +
  geom_point() + scale_x_continuous(trans = 'log10') +
  scale_color_manual(values = c("#504B76", "#D4D2D2", "#E4221E")) +
  geom_hline(yintercept=0, linetype="dashed") +
  xlab("Mean expression (baseMean - CPM)") + ylab("log2FC expression (PACA1 vs PACA3)") + 
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none")
DEseq.plot

pdf("./Rplots/PACA.DEseq.plot.pdf", width=7, height=6, useDingbats=FALSE)
DEseq.plot
dev.off()
## quartz_off_screen 
##                 2

3.2 Gene set enrichment

Gene set enrichment analysis from DESeq result fgsea is the R version of the software developed by the Broad institute Gene sets are from the Molecular Signatures Database (MSigDB) and downloaded in a R format from https://bioinf.wehi.edu.au/MSigDB/v7.1/ (v7)

# r

# Load DEseq results
DE.PACA1.vs.PACA3.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")
DE.PACA1.vs.PACA2.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.2.clust.DEseq.result.rds")

# Convert gene.id to NCBI gene ID
hsapiens.coding.genes <- readRDS("./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
hsapiens.coding.genes.2 <- hsapiens.coding.genes %>% dplyr::select(gene.id = gene.symbol, ncbi.gene.id)
DE.PACA1.vs.PACA3.gsea.df <- DE.PACA1.vs.PACA3.df %>% left_join(hsapiens.coding.genes.2, by = "gene.id")
DE.PACA1.vs.PACA2.gsea.df <- DE.PACA1.vs.PACA2.df %>% left_join(hsapiens.coding.genes.2, by = "gene.id")

# Create ranks based on the DESeq Wald statistic (log2FoldChange/lfcSE)
PACA1.vs.PACA3.gseaDat.ranks <- DE.PACA1.vs.PACA3.gsea.df$stat
  names(PACA1.vs.PACA3.gseaDat.ranks) <- DE.PACA1.vs.PACA3.gsea.df$ncbi.gene.id
PACA1.vs.PACA2.gseaDat.ranks <- DE.PACA1.vs.PACA2.gsea.df$stat
  names(PACA1.vs.PACA2.gseaDat.ranks) <- DE.PACA1.vs.PACA2.gsea.df$ncbi.gene.id

# Load KEGG pathways
pathwaysKEGG <- readRDS("./Dataset/GSEA/Hs.c2.cp.kegg.v7.1.entrez.rds")

# Perform gsea
PACA1.vs.PACA3.fgseaKEGG <- fgsea(pathwaysKEGG, PACA1.vs.PACA3.gseaDat.ranks, minSize=15, maxSize = 500, nperm=1000)
PACA1.vs.PACA2.fgseaKEGG <- fgsea(pathwaysKEGG, PACA1.vs.PACA2.gseaDat.ranks, minSize=15, maxSize = 500, nperm=1000)

# Merge results
PACA1.vs.PACA3.fgseaKEGG.df <- PACA1.vs.PACA3.fgseaKEGG %>% dplyr::select(pathway, pval.PACA1.vs.PACA3 = pval, NES.PACA1.vs.PACA3 = NES, size, leadingEdge)
PACA1.vs.PACA2.fgseaKEGG.df <- PACA1.vs.PACA2.fgseaKEGG %>% dplyr::select(pathway, pval.PACA1.vs.PACA2 = pval, NES.PACA1.vs.PACA2 = NES)
fgseaKEGG.df <- PACA1.vs.PACA3.fgseaKEGG.df %>% left_join(PACA1.vs.PACA2.fgseaKEGG.df, by = "pathway") %>% mutate(NES = (NES.PACA1.vs.PACA3+NES.PACA1.vs.PACA2)/2)

# Combine pvalues according to the fisher method
fishersMethod <- function(x) pchisq(-2 * sum(log(x)),df=2*length(x),lower=FALSE)
pval <- vector()
for (i in 1:nrow(fgseaKEGG.df)) {
  pval.i <- c(fgseaKEGG.df[i,]$pval.PACA1.vs.PACA3, fgseaKEGG.df[i,]$pval.PACA1.vs.PACA2)
  pval.fisher.i <- fishersMethod(pval.i)
  pval <- append(pval, pval.fisher.i)
}
fgseaKEGG.df$pval = pval
saveRDS(fgseaKEGG.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.fgseaKEGG.df.rds")

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

fgseaKEGG.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.fgseaKEGG.df.rds")

# Ranked pathways
gseaDat.ranks.df <- cbind.data.frame(pathway = fgseaKEGG.df$pathway, NES = fgseaKEGG.df$NES) %>% mutate(rank = dense_rank(dplyr::desc(-NES)))

# Select pathway to highlight

# DNA repair and replication
DNA.repair.replication.keys <- c("REPAIR", "REPLICATION", "RECOMBINATION", "CELL_CYCLE")
DNA.repair.replication.pathway <- unique(grep(paste(DNA.repair.replication.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))

# Glucose metabolism
Glucose.metabolism.keys <- c("FRUCTOSE", "GALACTOSE", "ASCORBATE", "PURINE", "PYRIMIDINE", "SUCROSE", "PYRUVATE", "GLUCO", "PENTOSE", "CITRATE", "NUCLEOTIDE")
Glucose.metabolism.pathway <- unique(grep(paste(Glucose.metabolism.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))

# Glucose and cell proliferation
Cancer.keys <- c("CANCER", "MTOR", "NOTCH", "LEUKEMIA", "P53", "ERBB", "GLIOMA", "MELANOMA", "ADHESION", "APOPTOSIS", "VEGF", "WNT", "MAPK", "CARCINOMA", "JAK", "TGF")
Cancer.pathway <- unique(grep(paste(Cancer.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))

# Add color codes
gseaDat.ranks.df <- gseaDat.ranks.df %>%
                            mutate(class = case_when(pathway %in% DNA.repair.replication.pathway ~ "DNA repair and replication",
                                                     pathway %in% Glucose.metabolism.pathway ~ "Carbohydrate metabolism",
                                                     pathway %in% Cancer.pathway ~ "Cancer and cell proliferation",
                                                     T ~ "others"))

gseaDat.ranks.plot <- gseaDat.ranks.df %>% ggplot(aes(x = rank, y = NES, fill = class)) +
  geom_bar(stat = "identity") + xlab("Pathway ranks") + ylab("Normalised enrichment score") +
  scale_fill_manual(values = c("#504B76", "#F39208", "#E52620", "#D4D2D2")) +
  geom_hline(yintercept=0, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=0.5)
gseaDat.ranks.plot

pdf("./Rplots/PACA.fgseaKEGG.plot.pdf", width=7, height=6, useDingbats=FALSE)
gseaDat.ranks.plot
dev.off()
## quartz_off_screen 
##                 2

Prepare GSEA plots for pathway groups

library(ggpubr)
# r

# Recover gene id associated with pathway groups
Cancer.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% Cancer.pathway]))
Glucose.metabolism.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% Glucose.metabolism.pathway]))
DNA.repair.replication.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% DNA.repair.replication.pathway]))

# Combine p-values with the Fisher's method
Cancer.pval <- (PACA1.vs.PACA3.fgseaKEGG %>% filter(pathway %in% Cancer.pathway))$pval
    Cancer.combine.pval <- fishersMethod(Cancer.pval)
Glucose.metabolism.pval <- (PACA1.vs.PACA2.fgseaKEGG %>% filter(pathway %in% Glucose.metabolism.pathway))$pval
    Glucose.metabolism.combine.pval <- fishersMethod(Glucose.metabolism.pval)
DNA.repair.replication.pval <- (PACA1.vs.PACA2.fgseaKEGG %>% filter(pathway %in% DNA.repair.replication.pathway))$pval
    DNA.repair.replication.combine.pval <- fishersMethod(DNA.repair.replication.pval)

# Plot enrichment
cancer.plot <- plotEnrichment(Cancer.genes, PACA1.vs.PACA3.gseaDat.ranks) + labs(title=paste("Cancer and cell proliferation | Pval =", signif(Cancer.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5)

gluc.plot <- plotEnrichment(Glucose.metabolism.genes, PACA1.vs.PACA2.gseaDat.ranks) + labs(title=paste("Glucose metabolism | Pval =", signif(Glucose.metabolism.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5)

repair.plot <- plotEnrichment(DNA.repair.replication.genes, PACA1.vs.PACA2.gseaDat.ranks) + labs(title=paste("DNA repair and replication | Pval =", signif(DNA.repair.replication.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5) # pval = 0.017496635

pdf("./Rplots/PACA.fgseaKEGG.gsea.plot.pdf", width=6, height=10, useDingbats=FALSE)
ggarrange(cancer.plot, gluc.plot, repair.plot, nrow = 3)
dev.off()
GSEA plots

GSEA plots

The pathways related to DNA repair and Glucose metabolism are further charaterised by decreasing the level of analysis. Sub-pathways were manually retrieved from https://reactome.org/PathwayBrowser/.

# r

# 14 DNA repair and replication pathways were defined

# Define path
DDR.reactome.files <- list.files("./Dataset/Reactome/DNA_repair/")
DDR.reactome.path <- paste("./Dataset/Reactome/DNA_repair", DDR.reactome.files, sep = "/")
DDR.reactome.path.names <- gsub("\\..*", "", DDR.reactome.files)
DDR.reactome.path.names <- gsub("_", " ", DDR.reactome.path.names)
# Recover gene.id
DDR.reactome.df <- tibble()
for (i in 1:length(DDR.reactome.path)) {
  df.i <- read.table(DDR.reactome.path[i], skip = 1) %>% mutate(V3 = DDR.reactome.path.names[i])
  DDR.reactome.df <- rbind(DDR.reactome.df, df.i)
}
DDR.reactome.df <- DDR.reactome.df %>% dplyr::select(REact.name = V3, gene.id = V2) %>% left_join(hsapiens.coding.genes.2, by = "gene.id") %>% distinct()
DDR.reactome.df <- DDR.reactome.df %>% dplyr::select(-gene.id)
# Prepare list
DDR.reactome.list.df <- DDR.reactome.df %>% group_by(REact.name) %>% summarise(ncbi.gene.id = list(ncbi.gene.id))
DDR.reactome <- DDR.reactome.list.df$ncbi.gene.id
names(DDR.reactome) <- unique(DDR.reactome.list.df$REact.name)
# Perform gsea
fgseaDDR.reactome.PACA1.vs.PACA2 <- fgsea(DDR.reactome, PACA1.vs.PACA2.gseaDat.ranks, eps = 0.0)
fgseaDDR.reactome.PACA1.vs.PACA3 <- fgsea(DDR.reactome, PACA1.vs.PACA3.gseaDat.ranks, eps = 0.0)
# Combine pvalues
fgseaDDR.reactome.combine.pval <- vector()
for (i in 1:length(unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway))) {
  pathway.i <- unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway)[i]
    pval.1.i <- (fgseaDDR.reactome.PACA1.vs.PACA2 %>% filter(pathway == pathway.i))$pval
    pval.2.i <- (fgseaDDR.reactome.PACA1.vs.PACA3 %>% filter(pathway == pathway.i))$pval
  pval.fisher.i <- fishersMethod(c(pval.1.i, pval.2.i))
  fgseaDDR.reactome.combine.pval <- append(fgseaDDR.reactome.combine.pval, pval.fisher.i)
}
fgseaDDR.reactome.df <- cbind.data.frame(pathway = unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway), pval = fgseaDDR.reactome.combine.pval) %>% left_join((fgseaDDR.reactome.PACA1.vs.PACA2 %>% dplyr::select(pathway, size, leadingEdge)), by = "pathway")
# add information on number of hits
hits.count <- vector()
for (i in 1:nrow(fgseaDDR.reactome.df)) {
  leadingEdge.i <- fgseaDDR.reactome.df[i,]$leadingEdge
  hits.count.i <- length(unlist(leadingEdge.i))
  hits.count <- c(hits.count, hits.count.i)
}
fgseaDDR.reactome.df$hits.count <- hits.count
saveRDS(fgseaDDR.reactome.df, "./003_Transcriptomics_cluster_analysis/rds/fgseaDDR.reactome.df.rds")

# 11 carbohydrate metabolic pathways were defined

# Define path
Gluc.reactome.files <- list.files("./Dataset/Reactome/Glucose/")
Gluc.reactome.path <- paste("./Dataset/Reactome/Glucose/", Gluc.reactome.files, sep = "/")
Gluc.reactome.path.names <- gsub("\\..*", "", Gluc.reactome.files)
Gluc.reactome.path.names <- gsub("_", " ", Gluc.reactome.path.names)
# Recover gene.id
Gluc.reactome.df <- tibble()
for (i in 1:length(Gluc.reactome.path)) {
  df.i <- read.table(Gluc.reactome.path[i], skip = 1) %>% mutate(V3 = Gluc.reactome.path.names[i])
  Gluc.reactome.df <- rbind(Gluc.reactome.df, df.i)
}
Gluc.reactome.df <- Gluc.reactome.df %>% dplyr::select(REact.name = V3, gene.id = V2) %>% left_join(hsapiens.coding.genes.2, by = "gene.id") %>% distinct()
Gluc.reactome.df <- Gluc.reactome.df %>% dplyr::select(-gene.id)
# Prepare list
Gluc.reactome.list.df <- Gluc.reactome.df %>% group_by(REact.name) %>% summarise(ncbi.gene.id = list(ncbi.gene.id))
Gluc.reactome <- Gluc.reactome.list.df$ncbi.gene.id
names(Gluc.reactome) <- unique(Gluc.reactome.list.df$REact.name)
# Perform gsea
fgseaGluc.reactome.PACA1.vs.PACA2 <- fgsea(Gluc.reactome, PACA1.vs.PACA2.gseaDat.ranks, eps = 0.0)
fgseaGluc.reactome.PACA1.vs.PACA3 <- fgsea(Gluc.reactome, PACA1.vs.PACA3.gseaDat.ranks, eps = 0.0)
# Combine pvalues
fgseaGluc.reactome.combine.pval <- vector()
for (i in 1:length(unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway))) {
  pathway.i <- unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway)[i]
  pval.1.i <- (fgseaGluc.reactome.PACA1.vs.PACA2 %>% filter(pathway == pathway.i))$pval
  pval.2.i <- (fgseaGluc.reactome.PACA1.vs.PACA3 %>% filter(pathway == pathway.i))$pval
  pval.fisher.i <- fishersMethod(c(pval.1.i, pval.2.i))
  fgseaGluc.reactome.combine.pval <- append(fgseaGluc.reactome.combine.pval, pval.fisher.i)
}
fgseaGluc.reactome.df <- cbind.data.frame(pathway = unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway), pval = fgseaGluc.reactome.combine.pval) %>% left_join((fgseaGluc.reactome.PACA1.vs.PACA2 %>% dplyr::select(pathway, size, leadingEdge)), by = "pathway")
# add information on number of hits
hits.count <- vector()
for (i in 1:nrow(fgseaGluc.reactome.df)) {
  leadingEdge.i <- fgseaGluc.reactome.df[i,]$leadingEdge
  hits.count.i <- length(unlist(leadingEdge.i))
  hits.count <- c(hits.count, hits.count.i)
}
fgseaGluc.reactome.df$hits.count <- hits.count
saveRDS(fgseaGluc.reactome.df, "./003_Transcriptomics_cluster_analysis/rds/fgseaGluc.reactome.df.rds")

Plot

library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# DDR
fgseaDDR.reactome.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/fgseaDDR.reactome.df.rds")
fgseaDDR.reactome.plot <- fgseaDDR.reactome.df %>% 
  mutate(hitsPerc=hits.count*100/size, log10.pval = -log10(pval)) %>%
  arrange(desc(-log10.pval)) %>%
  mutate(PATH = factor(pathway, levels = unique(pathway))) %>% 
  ggplot(aes(x= log10.pval,
             y= PATH,
             size=size)) +
  geom_point(color = "#4F4A75") + xlim(0,25) + xlab("-log10 Pval") +
  theme_bw() + theme(aspect.ratio=3, axis.title.y=element_blank())
fgseaDDR.reactome.plot

# Carbohydrate
fgseaGluc.reactome.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/fgseaGluc.reactome.df.rds")
fgseaGluc.reactome.plot <- fgseaGluc.reactome.df %>% 
  mutate(hitsPerc=hits.count*100/size, log10.pval = -log10(pval)) %>%
  arrange(desc(-log10.pval)) %>%
  mutate(PATH = factor(pathway, levels = unique(pathway))) %>% 
  ggplot(aes(x= log10.pval,
             y= PATH,
             size=size)) +
  geom_point(color = "#4F4A75") + xlim(0,8) + xlab("-log10 Pval") +
  theme_bw() + theme(aspect.ratio=3, axis.title.y=element_blank())
fgseaGluc.reactome.plot

pdf("./Rplots/PACA.DDR.Gluc.stat.plot.pdf", width=12, height=6, useDingbats=FALSE)
ggarrange(fgseaDDR.reactome.plot, fgseaGluc.reactome.plot, nrow = 1)
dev.off()
## quartz_off_screen 
##                 2

3.3 Dysregulation of glycolysis and PPP pathways

The gene set enrichment analysis revealed that both glycolysis and pentose phosphate pathway (PPP) pathways are dysregulated in PACA 1. Although PPP and glycolysis are distinct, they involve three common intermediates, glucose 6-phosphate, glyceraldehyde 3-phosphate, and fructose 6-phosphate, so the two pathways are interconnected. It worth noting that PPP is responsible for the generation of ribose 5-phosphate for nucleotide synthesis. We are, herein, asssessing which pathway is the more affected in order to establish how dysregulation of glucose metabolism may affect mutational burden at origins.

Assess the corelation between gene expression and snvs burden at origins for genes involved in both pathways

# We consider all genes previously associated to carbohydrate metabolism (Glucose.metabolism.genes)
# Recover gene id
Glucose.metabolism.gene.id <- (hsapiens.coding.genes.2 %>% filter(ncbi.gene.id %in% Glucose.metabolism.genes))$gene.id # 469 genes

# Load gene expression data
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")

# Select donor from PACA clusters
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
PACA.clusters.summary.light.df <- PACA.clusters.summary.df %>% dplyr::select(donor, cluster) %>% distinct()

# Filter expression data
PACA.transc.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.clusters.summary.light.df$donor) %>% left_join(PACA.clusters.summary.light.df, by = "donor")

# Compute Pearson correlations
# Only genes with TPM >= 1 and corelation with more than 10 observations are computed
PACA.cor.TPM.df <- tibble()
for (i in 1:length(Glucose.metabolism.gene.id)) {
  GOI.i <- Glucose.metabolism.gene.id[i]
  print(i/length(Glucose.metabolism.gene.id))
  TPM.df.i <- PACA.transc.TPM.df %>% filter(gene.id == GOI.i & TPM >= 1) 
  if(nrow(TPM.df.i) <= 10){cor.df.i <- tibble()} else {
  cor.i <- cor.test(log10(TPM.df.i$TPM), TPM.df.i$snvs.dens.ratio)
    Rho.i <- as.numeric(cor.i$estimate)
    pval.i <- as.numeric(cor.i$p.value)
  cor.df.i <- cbind.data.frame(GOI = GOI.i, Rho = Rho.i, pval = pval.i)}  
  PACA.cor.TPM.df <- rbind(PACA.cor.TPM.df, cor.df.i) 
}
# Label genes unique to the glycolysis, ppp and nucleotide biosynthesis pathways
glycolysis.genes <- read.table("./Dataset/Reactome/Glucose/Glycolysis.HSA-70326.tsv")$MoleculeName
ppp.genes <- read.table("./Dataset/Reactome/Glucose/Pentose_phosphate_pathway.HSA-71336.tsv")$MoleculeName
nucleotide.genes <- read.table("./Dataset/Reactome/Glucose/Nucleotide_biosynthesis.HSA-8956320.tsv")$MoleculeName
    glycolysis.genes <- glycolysis.genes[!glycolysis.genes %in% ppp.genes] # 80 genes
    ppp.genes <- ppp.genes[!ppp.genes %in% glycolysis.genes] # 15 genes
PACA.cor.TPM.df <- PACA.cor.TPM.df %>% mutate(pathway = case_when(GOI %in% glycolysis.genes ~ "Glycolysis",
                                                                  GOI %in% ppp.genes ~ "Pentose phosphate pathway",
                                                                  GOI %in% nucleotide.genes ~ "Nucleotide biosynthesis",
                                                                  T ~ "others")) %>% distinct()
# Save
saveRDS(PACA.cor.TPM.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.cor.TPM.df.rds")

# Prepare df for plotting relevant examples
PACA.transc.ATIC.df <- PACA.transc.TPM.df %>% filter(gene.id == "ATIC" & TPM >= 10)
  saveRDS(PACA.transc.ATIC.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.ATIC.df.rds")
PACA.transc.TALD01.df <- PACA.transc.TPM.df %>% filter(gene.id == "TALDO1" & TPM >= 10)
  saveRDS(PACA.transc.TALD01.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.TALD01.df.rds")

Plot

library(dplyr)
library(ggplot2)
library(ggpubr)
library(forcats)
setwd("/Users/pm23/Desktop/Projects/OriCan")

# Correlation plot
PACA.cor.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.cor.TPM.df.rds")
PACA.cor.TPM.plot <- PACA.cor.TPM.df %>% arrange(factor(pathway, levels = c("others", "Glycolysis", "Pentose phosphate pathway", "Nucleotide biosynthesis"))) %>% 
  mutate(pathway = fct_inorder(pathway)) %>%
  ggplot(aes(x=Rho, y=-log10(pval), color=pathway)) +
  geom_point() + xlim(-0.5,0.5) + xlab("Pearson correlation") + ylab("log10 Pval") +
  scale_color_manual(values = c("#D4D2D2", "#504B76", "#F39208", "#E52620")) +
  theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
PACA.cor.TPM.plot

# Selected examples
# ATIC
PACA.transc.ATIC.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.transc.ATIC.df.rds")
PACA.transc.ATIC.plot <- PACA.transc.ATIC.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
  geom_point(shape = 21, fill = "white") +
  geom_smooth(method = lm, se = T, color = "#504B76") +
  scale_x_continuous(trans = log10_trans()) +
  xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("ATIC, Rho = -0.403, Pval = 6.352e-09") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.transc.ATIC.plot

# TALD01
PACA.transc.TALD01.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.transc.TALD01.df.rds")
PACA.transc.TALD01.plot <- PACA.transc.TALD01.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
  geom_point(shape = 21, fill = "white") +
  geom_smooth(method = lm, se = T, color = "#504B76") +
  scale_x_continuous(trans = log10_trans()) +
  xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("TALD01, Rho = -0.372, Pval = 9.055e-08") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.transc.TALD01.plot

pdf("./Rplots/PACA.PPP.plot.pdf", width=12, height=5, useDingbats=FALSE)
ggarrange(PACA.cor.TPM.plot, PACA.transc.ATIC.plot, PACA.transc.TALD01.plot, nrow = 1)
dev.off()
## quartz_off_screen 
##                 2

3.4 Biomarkers of replication stress

Previous results suggest that PACA 1 defines a subset of tumours characterised by general replicative stress (oncogene activation, DNA repair defficiency and altered glucose metabolism). In this section, we aim to characterise further the replication stress signature associated with each PACA cluster. We then assess the expression of known biomarkers of replicative stress in pancreatic cancer (Dreyer et al. Gastroenterology, 2021, 160, 362-377 - https://www.gastrojournal.org/article/S0016-5085(20)35229-X/fulltext).

# Define biomarkers (according to Figure 3E)
PACA.biomarkers <- c("KRAS", "CCND2", "NPM1", "MYC", "CCND1", "MDM2", "STAT5A", "HRAS", "CCNE1", "CCND3", "NRAS", "DEK", "CCNE2", "CDC6", "CDCA5", "AURKA", "MYBL2", "E2F1") # 18 genes

# Recover gene expression data and normalise expression (Z-score)
PACA.transc.biom.TPM.df <- PACA.transc.TPM.df %>% filter(gene.id %in% PACA.biomarkers) %>% 
  group_by(gene.id) %>% mutate(zscore = (raw.count-mean(raw.count, na.rm = T))/sd(raw.count, na.rm = T)) %>% ungroup()

# Compute mean scores for ordering heatmap
mean.zscore <- PACA.transc.biom.TPM.df %>% group_by(donor) %>% summarise(mean = mean(zscore)) %>% mutate(rank = dense_rank(-dplyr::desc(mean))) %>% arrange(rank) %>% dplyr::select(donor, rank)

# Order df
PACA.transc.biom.order.df <- PACA.transc.biom.TPM.df %>% dplyr::select(donor, gene.id, zscore, cluster) %>% left_join(mean.zscore, by = "donor")
# donor
PACA.transc.biom.order.df$donor <- factor(PACA.transc.biom.order.df$donor, levels=unique((PACA.transc.biom.order.df$donor)[order(PACA.transc.biom.order.df$rank)]))

# Prepare facets for plotting
PACA.transc.biom.order.df$facet <- ifelse(PACA.transc.biom.order.df$cluster %in% c("clust.1", "clust.2", "clust.3"), PACA.transc.biom.order.df$cluster)

# Save
saveRDS(PACA.transc.biom.order.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.biom.order.df.rds")

Plot heatmap

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

PACA.transc.biom.order.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.biom.order.df.rds")
PACA.transc.biom.plot <- PACA.transc.biom.order.df %>% ggplot(aes(donor, gene.id, fill= zscore)) + 
  geom_tile() +
  scale_fill_gradientn(colours = c("#4F4A75", "white", "#E52620"), limits=c(-2, 2), na.value = '#E52620') +
  theme_bw() + theme(axis.title.y=element_blank(), axis.text.x=element_text(angle=45,hjust=1)) +
  facet_grid(.~facet, space = "free", scales = "free_x")
PACA.transc.biom.plot

pdf("./Rplots/PACA.transc.biom.plot.pdf", width=8, height=3, useDingbats=FALSE)
PACA.transc.biom.plot
dev.off()
## quartz_off_screen 
##                 2

This analysis suggests: - cluster 1 and 2 are characterised by high replication stress (oncogene overexpression). - cluster 2 show higher expression for cyclins D. Both tumours in cluster 1 and 2 diplay features of squamous carcinoma.

3.5 Cell proliferation and cell cycle dysregulation

This section aims to assess whether cell proliferation and cell cycle are dysregulated in the different identified PACA clusters.

Cell proliferation is assessed by looking at cyclin D1 expression.

# r

# Recover gene expression
PACA.CCND1.TPM.df <- PACA.transc.TPM.df %>% filter(gene.id == "CCND1")
saveRDS(PACA.CCND1.TPM.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.CCND1.TPM.df.rds")

# Compute stats

# Comparing clusters
ks.test(PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.1"),]$TPM, PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.2"),]$TPM) # p-value = 0.343
ks.test(PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.2"),]$TPM, PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.3"),]$TPM) # p-value < 2.2e-16

# Correlation between CCND1 expression and mutational burden at origins
cor.test(log10(PACA.CCND1.TPM.df$TPM), PACA.CCND1.TPM.df$snvs.dens.ratio) # 0.4046014, p-value = 4.882e-09

Plot

library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")

PACA.CCND1.TPM.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.CCND1.TPM.df.rds")
PACA.CCND1.plot <- PACA.CCND1.TPM.df %>%
  ggplot(aes(x=cluster, y=TPM, fill=cluster)) +
  geom_boxplot(alpha = 0.6) + ggtitle("CCND1, Pval = 0.343, < 2.2e-16") +
  scale_y_continuous(trans = 'log10', limits = c(0.05, 2000)) +
  scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
  theme_bw() + theme(aspect.ratio=2, legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
PACA.CCND1.plot

PACA.CCND1.snvs.plot <- PACA.CCND1.TPM.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
  geom_point(shape = 21, fill = "white") +
  geom_smooth(method = lm, se = T, color = "#504B76") +
  scale_x_continuous(trans = log10_trans()) +
  xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("CCND1, Rho = 0.404, Pval = 4.882e-09") +
  geom_hline(yintercept=1, linetype="dashed") +
  theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.CCND1.snvs.plot

pdf("./Rplots/PACA.CCND1.plot.pdf", width=8, height=6, useDingbats=FALSE)
ggarrange(PACA.CCND1.plot, PACA.CCND1.snvs.plot, nrow = 1)
dev.off()
## quartz_off_screen 
##                 2

Cell cycle is estimated by the ratio of cyclin expression: - cyclin E / others: G1/S - cyclin A / others: S/G2 - cyclin B / others : G2/M

#r

# Select cyclin encoding genes
cyclins <- c("CCNA1", "CCNA2", "CCNB1", "CCNB2", "CCND1", "CCND2", "CCND3", "CCNE1", "CCNE2")

# Recover expression data and format df
PACA.transc.TPM.spread.df <- PACA.transc.TPM.df %>% dplyr::select(donor, gene.id, TPM) %>% distinct() %>% pivot_wider(names_from = gene.id, values_from = TPM, values_fn = first)
PACA.cyclin.TPM.df <- PACA.transc.TPM.spread.df %>% dplyr::select(all_of(cyclins))

# Compute expression ratios
PACA.cell.cycle.df <- PACA.cyclin.TPM.df %>% mutate(SUM = CCNA1 + CCNA2 + CCNB1 + CCNB2 + CCNE1 + CCNE2) %>%
  mutate(G1_S = (CCNE1 + CCNE2)/SUM, S_G2 = (CCNA1 + CCNA2)/SUM, G2_M = (CCNB1 + CCNB2)/SUM)

# Normalise each cycle
PACA.cell.cycle.df <- PACA.cell.cycle.df %>% dplyr::select(donor, G1_S, S_G2, G2_M) %>% 
  mutate(cell.sum = G1_S + S_G2 + G2_M,
         G1_S = G1_S/cell.sum,
         S_G2 = S_G2/cell.sum,
         G2_M = G2_M/cell.sum) %>% dplyr::select(-cell.sum)

# Add cluster information
PACA.cell.cycle.df <- PACA.cell.cycle.df %>% left_join((PACA.clusters.summary.df %>% dplyr::select(donor, cluster)), by = "donor")
saveRDS(PACA.cell.cycle.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.cell.cycle.df.rds")

# Compute stats

# G1/S phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$G1_S, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G1_S, alternative = "greater") # p-value = 0.007093
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G1_S, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$G1_S, alternative = "greater") # p-value = 0.2203

# S/G2 phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$S_G2, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$S_G2, alternative = "less") # p-value = 6.198e-08
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$S_G2, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$S_G2, alternative = "less") # p-value = 0.7036

# G2/M phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$G2_M, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G2_M, alternative = "greater") # p-value = 1.145e-06
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G2_M, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$G2_M, alternative = "greater") # p-value = 0.8931

Plot

library(dplyr)
library(tidyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

PACA.cell.cycle.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.cell.cycle.df.rds")
PACA.cell.cycle.plot <- PACA.cell.cycle.df %>%
  ungroup() %>% dplyr::select(G1_S, S_G2, G2_M, cluster) %>%
  gather("phase", "value", -cluster) %>% mutate(PHASE = fct_relevel(phase, "G1_S", "S_G2", "G2_M")) %>%
  ggplot(aes(x=PHASE, y=value, fill=cluster)) +
  geom_boxplot(alpha = 0.6) + ylab("Fraction") + ylim(0, 0.8) +
  scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
  theme_bw() + theme(aspect.ratio=1, axis.title.x=element_blank())
PACA.cell.cycle.plot

pdf("./Rplots/PACA.cell.cycle.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.cell.cycle.plot
dev.off()
## quartz_off_screen 
##                 2

This analysis shows: - PACA 1 and PACA 2 are characterised by increased cell proliferation. - PACA 1 is characterised by a longer S-phase and shorted G1/G2 phases.

4 PACA cluster survival

Estimate cancer aggressiveness from survival analysis

# r
library(survminer)

# Recover donor information
donor.info <- read.table("./Dataset/ICGC/donor.all_projects.tsv", sep = "\t", header = T)
donor.info <- donor.info %>% dplyr::select(donor = icgc_donor_id, survival = donor_survival_time)

# Add cluster information
PACA.donor.info <- donor.info %>% left_join((PACA.clusters.summary.df %>% dplyr::select(donor, cluster)), by = "donor") %>% drop_na() %>% distinct()
saveRDS(PACA.donor.info, "./003_Transcriptomics_cluster_analysis/rds/PACA.donor.info.rds")

# Compute stats
ks.test(PACA.donor.info[which(PACA.donor.info$cluster == "clust.1"),]$survival, PACA.donor.info[which(PACA.donor.info$cluster == "clust.3"),]$survival, alternative = "greater") # p-value = 0.006931
ks.test(PACA.donor.info[which(PACA.donor.info$cluster == "clust.1"),]$survival, PACA.donor.info[which(PACA.donor.info$cluster == "clust.2"),]$survival, alternative = "greater") # p-value = 0.01849

Plot

library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")

PACA.donor.info <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.donor.info.rds")
PACA.donor.survival.plot <- PACA.donor.info %>% ggplot(aes(survival, colour = cluster)) +
  stat_ecdf() + scale_y_reverse() +
  scale_color_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
  xlab("Days") + ylab("Percent survival") + xlim(0,3000) + ggtitle("Kaplan-Meier curves | Pval = 0.006931 & 0.01849") +
  theme_bw() + theme(aspect.ratio=1)
PACA.donor.survival.plot

pdf("./Rplots/PACA.donor.survival.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.donor.survival.plot
dev.off()
## quartz_off_screen 
##                 2